/* $NetBSD: bcopy.S,v 1.1 2005/12/20 19:28:49 christos Exp $ */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Trevor Blackwell.  Support for use as memcpy() and memmove()
 *	   added by Chris Demetriou.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include <machine/asm.h>

#if defined(MEMCOPY) || defined(MEMMOVE)
#ifdef MEMCOPY
#define	FUNCTION	memcpy
#else
#define FUNCTION	memmove
#endif
#define	SRCREG		a1
#define	DSTREG		a0
#else /* !(defined(MEMCOPY) || defined(MEMMOVE)) */
#define	FUNCTION	bcopy
#define	SRCREG		a0
#define	DSTREG		a1
#endif /* !(defined(MEMCOPY) || defined(MEMMOVE)) */

#define	SIZEREG		a2

/*
 * Copy bytes.
 *
 * void bcopy(char *from, char *to, size_t len);
 * char *memcpy(void *to, const void *from, size_t len);
 * char *memmove(void *to, const void *from, size_t len);
 *
 * No matter how invoked, the source and destination registers
 * for calculation.  There's no point in copying them to "working"
 * registers, since the code uses their values "in place," and
 * copying them would be slower.
 */

LEAF(FUNCTION,3)

#if defined(MEMCOPY) || defined(MEMMOVE)
	/* set up return value, while we still can */
	mov	DSTREG,v0
#endif

	/* Check for negative length */
	ble	SIZEREG,bcopy_done

	/* Check for overlap */
	subq	DSTREG,SRCREG,t5
	cmpult	t5,SIZEREG,t5
	bne	t5,bcopy_overlap

	/* a3 = end address */
	addq	SRCREG,SIZEREG,a3

	/* Get the first word */
	ldq_u	t2,0(SRCREG)

	/* Do they have the same alignment? */
	xor	SRCREG,DSTREG,t0
	and	t0,7,t0
	and	DSTREG,7,t1
	bne	t0,bcopy_different_alignment

	/* src & dst have same alignment */
	beq	t1,bcopy_all_aligned

	ldq_u	t3,0(DSTREG)
	addq	SIZEREG,t1,SIZEREG
	mskqh	t2,SRCREG,t2
	mskql	t3,SRCREG,t3
	or	t2,t3,t2

	/* Dst is 8-byte aligned */

bcopy_all_aligned:
	/* If less than 8 bytes,skip loop */
	subq	SIZEREG,1,t0
	and	SIZEREG,7,SIZEREG
	bic	t0,7,t0
	beq	t0,bcopy_samealign_lp_end

bcopy_samealign_lp:
	stq_u	t2,0(DSTREG)
	addq	DSTREG,8,DSTREG
	ldq_u	t2,8(SRCREG)
	subq	t0,8,t0
	addq	SRCREG,8,SRCREG
	bne	t0,bcopy_samealign_lp

bcopy_samealign_lp_end:
	/* If we're done, exit */
	bne	SIZEREG,bcopy_small_left
	stq_u	t2,0(DSTREG)
	RET

bcopy_small_left:
	mskql	t2,SIZEREG,t4
	ldq_u	t3,0(DSTREG)
	mskqh	t3,SIZEREG,t3
	or	t4,t3,t4
	stq_u	t4,0(DSTREG)
	RET

bcopy_different_alignment:
	/*
	 * this is the fun part
	 */
	addq	SRCREG,SIZEREG,a3
	cmpule	SIZEREG,8,t0
	bne	t0,bcopy_da_finish

	beq	t1,bcopy_da_noentry

	/* Do the initial partial word */
	subq	zero,DSTREG,t0
	and	t0,7,t0
	ldq_u	t3,7(SRCREG)
	extql	t2,SRCREG,t2
	extqh	t3,SRCREG,t3
	or	t2,t3,t5
	insql	t5,DSTREG,t5
	ldq_u	t6,0(DSTREG)
	mskql	t6,DSTREG,t6
	or	t5,t6,t5
	stq_u	t5,0(DSTREG)
	addq	SRCREG,t0,SRCREG
	addq	DSTREG,t0,DSTREG
	subq	SIZEREG,t0,SIZEREG
	ldq_u	t2,0(SRCREG)

bcopy_da_noentry:
	subq	SIZEREG,1,t0
	bic	t0,7,t0
	and	SIZEREG,7,SIZEREG
	beq	t0,bcopy_da_finish2

bcopy_da_lp:
	ldq_u	t3,7(SRCREG)
	addq	SRCREG,8,SRCREG
	extql	t2,SRCREG,t4
	extqh	t3,SRCREG,t5
	subq	t0,8,t0
	or	t4,t5,t5
	stq	t5,0(DSTREG)
	addq	DSTREG,8,DSTREG
	beq	t0,bcopy_da_finish1
	ldq_u	t2,7(SRCREG)
	addq	SRCREG,8,SRCREG
	extql	t3,SRCREG,t4
	extqh	t2,SRCREG,t5
	subq	t0,8,t0
	or	t4,t5,t5
	stq	t5,0(DSTREG)
	addq	DSTREG,8,DSTREG
	bne	t0,bcopy_da_lp

bcopy_da_finish2:
	/* Do the last new word */
	mov	t2,t3

bcopy_da_finish1:
	/* Do the last partial word */
	ldq_u	t2,-1(a3)
	extql	t3,SRCREG,t3
	extqh	t2,SRCREG,t2
	or	t2,t3,t2
	br	zero,bcopy_samealign_lp_end

bcopy_da_finish:
	/* Do the last word in the next source word */
	ldq_u	t3,-1(a3)
	extql	t2,SRCREG,t2
	extqh	t3,SRCREG,t3
	or	t2,t3,t2
	insqh	t2,DSTREG,t3
	insql	t2,DSTREG,t2
	lda	t4,-1(zero)
	mskql	t4,SIZEREG,t5
	cmovne	t5,t5,t4
	insqh	t4,DSTREG,t5
	insql	t4,DSTREG,t4
	addq	DSTREG,SIZEREG,a4
	ldq_u	t6,0(DSTREG)
	ldq_u	t7,-1(a4)
	bic	t6,t4,t6
	bic	t7,t5,t7
	and	t2,t4,t2
	and	t3,t5,t3
	or	t2,t6,t2
	or	t3,t7,t3
	stq_u	t3,-1(a4)
	stq_u	t2,0(DSTREG)
	RET

bcopy_overlap:
	/*
	 * Basically equivalent to previous case, only backwards.
	 * Not quite as highly optimized
	 */
	addq	SRCREG,SIZEREG,a3
	addq	DSTREG,SIZEREG,a4

	/* less than 8 bytes - don't worry about overlap */
	cmpule	SIZEREG,8,t0
	bne	t0,bcopy_ov_short

	/* Possibly do a partial first word */
	and	a4,7,t4
	beq	t4,bcopy_ov_nostart2
	subq	a3,t4,a3
	subq	a4,t4,a4
	ldq_u	t1,0(a3)
	subq	SIZEREG,t4,SIZEREG
	ldq_u	t2,7(a3)
	ldq	t3,0(a4)
	extql	t1,a3,t1
	extqh	t2,a3,t2
	or	t1,t2,t1
	mskqh	t3,t4,t3
	mskql	t1,t4,t1
	or	t1,t3,t1
	stq	t1,0(a4)

bcopy_ov_nostart2:
	bic	SIZEREG,7,t4
	and	SIZEREG,7,SIZEREG
	beq	t4,bcopy_ov_lp_end

bcopy_ov_lp:
	/* This could be more pipelined, but it doesn't seem worth it */
	ldq_u	t0,-8(a3)
	subq	a4,8,a4
	ldq_u	t1,-1(a3)
	subq	a3,8,a3
	extql	t0,a3,t0
	extqh	t1,a3,t1
	subq	t4,8,t4
	or	t0,t1,t0
	stq	t0,0(a4)
	bne	t4,bcopy_ov_lp

bcopy_ov_lp_end:
	beq	SIZEREG,bcopy_done

	ldq_u	t0,0(SRCREG)
	ldq_u	t1,7(SRCREG)
	ldq_u	t2,0(DSTREG)
	extql	t0,SRCREG,t0
	extqh	t1,SRCREG,t1
	or	t0,t1,t0
	insql	t0,DSTREG,t0
	mskql	t2,DSTREG,t2
	or	t2,t0,t2
	stq_u	t2,0(DSTREG)

bcopy_done:
	RET

bcopy_ov_short:
	ldq_u	t2,0(SRCREG)
	br	zero,bcopy_da_finish

	END(FUNCTION)
